from dataidea.packages import * # imports np, pd, plt etc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from dataidea.datasets import loadDataset

Overview of Machine Learning
# Load the demo dataset and normalize the column names to snake_case
# so they are easier to reference as attributes (demo_df.age, etc.).
demo_df = loadDataset('demo')

cols = {
    "Age": "age",
    "Gender": "gender",
    "Marital Status": "marital_status",
    "Address": "address",
    "Income": "income",
    "Income Category": "income_category",
    "Job Category": "job_category",
}
demo_df.rename(columns=cols, inplace=True)
demo_df.columns
demo_df.describe() #will only give us numerical values| age | address | income | income_category | job_category | |
|---|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 42.475000 | 11.485000 | 76.305000 | 2.520000 | 1.950000 |
| std | 12.801122 | 10.365665 | 107.554647 | 1.065493 | 0.781379 |
| min | 19.000000 | 0.000000 | 11.000000 | 1.000000 | 1.000000 |
| 25% | 32.000000 | 3.000000 | 27.000000 | 2.000000 | 1.000000 |
| 50% | 43.000000 | 9.000000 | 44.500000 | 2.000000 | 2.000000 |
| 75% | 51.000000 | 17.000000 | 76.000000 | 4.000000 | 3.000000 |
| max | 76.000000 | 51.000000 | 873.000000 | 4.000000 | 3.000000 |
demo_df.select_dtypes(include=["object"])| gender | marital_status | |
|---|---|---|
| 0 | f | 1 |
| 1 | m | 0 |
| 2 | f | no answer |
| 3 | m | 1 |
| 4 | m | no answer |
| ... | ... | ... |
| 195 | f | 0 |
| 196 | f | 1 |
| 197 | f | 1 |
| 198 | m | 0 |
| 199 | m | 0 |
200 rows × 2 columns
demo_df.select_dtypes(include=["object"]).describe()| gender | marital_status | |
|---|---|---|
| count | 200 | 200 |
| unique | 4 | 3 |
| top | f | 0 |
| freq | 99 | 102 |
demo_df["gender"].value_counts().indexIndex(['f', 'm', ' f', ' m'], dtype='object', name='gender')
demo_df.gender.unique()array(['f', 'm', ' f', ' m'], dtype=object)
demo_df2 = demo_df.replace(to_replace=" f", value="f")demo_df2.gender.unique()array(['f', 'm', ' m'], dtype=object)
gender_col = demo_df2.gender.replace(to_replace=" m", value="m")
gender_col0 f
1 m
2 f
3 m
4 m
..
195 f
196 f
197 f
198 m
199 m
Name: gender, Length: 200, dtype: object
gender_col.unique()array(['f', 'm'], dtype=object)
demo_df2["gender"] = gender_coldemo_df2.gender.unique()array(['f', 'm'], dtype=object)
demo_df2.marital_status.unique()array(['1', '0', 'no answer'], dtype=object)
demo_df2.marital_status.value_counts()marital_status
0 102
1 93
no answer 5
Name: count, dtype: int64
demo_df2.select_dtypes(include=["number"]) #"float64","int64"| age | address | income | income_category | job_category | |
|---|---|---|---|---|---|
| 0 | 55 | 12 | 72.0 | 3.0 | 3 |
| 1 | 56 | 29 | 153.0 | 4.0 | 3 |
| 2 | 28 | 9 | 28.0 | 2.0 | 1 |
| 3 | 24 | 4 | 26.0 | 2.0 | 1 |
| 4 | 25 | 2 | 23.0 | 1.0 | 2 |
| ... | ... | ... | ... | ... | ... |
| 195 | 45 | 3 | 86.0 | 4.0 | 3 |
| 196 | 23 | 2 | 27.0 | 2.0 | 1 |
| 197 | 66 | 32 | 11.0 | 1.0 | 2 |
| 198 | 49 | 4 | 30.0 | 2.0 | 1 |
| 199 | 45 | 1 | 147.0 | 4.0 | 3 |
200 rows × 5 columns
demo_df2.isna().sum()age 0
gender 0
marital_status 0
address 0
income 0
income_category 0
job_category 0
dtype: int64
plt.boxplot(demo_df2["income"]){'whiskers': [<matplotlib.lines.Line2D>,
<matplotlib.lines.Line2D>],
'caps': [<matplotlib.lines.Line2D>,
<matplotlib.lines.Line2D>],
'boxes': [<matplotlib.lines.Line2D>],
'medians': [<matplotlib.lines.Line2D>],
'fliers': [<matplotlib.lines.Line2D>],
'means': []}

# exercise: function to calculate outliers using Tukey's fences:
# lower fence = Q1 - 1.5*(Q3 - Q1)
# upper fence = Q3 + 1.5*(Q3 - Q1)
def getOutliers(column):
    """Return the outliers in a 1-D numeric sequence and their positions.

    A value is an outlier when it falls below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR (Tukey's fences).

    Parameters
    ----------
    column : array-like
        Numeric values (e.g. a pandas Series or numpy array).

    Returns
    -------
    outliers : numpy.ndarray
        The outlying values, in order of appearance.
    outlier_indices : numpy.ndarray
        Positional (0-based) indices of the outliers within ``column``.
    """
    # Work on a plain ndarray: np.where yields *positional* indices, and
    # indexing a pandas Series with them would be label-based — correct only
    # by coincidence when the Series has a default RangeIndex.
    values = np.asarray(column)
    q1 = np.quantile(values, 0.25)
    q3 = np.quantile(values, 0.75)
    interquantile_range = q3 - q1
    lower_fence = q1 - 1.5 * interquantile_range
    upper_fence = q3 + 1.5 * interquantile_range
    outlier_indices = np.where((values < lower_fence) | (values > upper_fence))[0]
    outliers = values[outlier_indices]
    return outliers, outlier_indices

outliers, indexes = getOutliers(demo_df2.income)
# NOTE(review): drop() removes rows by *label* while `indexes` is positional;
# the two coincide here only because demo_df2 still has its default RangeIndex.
demo_df3 = demo_df2.drop(indexes)
plt.hist(demo_df2.age, bins=20, edgecolor="black")
6., 7., 6., 5., 2., 1., 1.]),
array([19. , 21.85, 24.7 , 27.55, 30.4 , 33.25, 36.1 , 38.95, 41.8 ,
44.65, 47.5 , 50.35, 53.2 , 56.05, 58.9 , 61.75, 64.6 , 67.45,
70.3 , 73.15, 76. ]),
<BarContainer object of 20 artists>)

plt.hist(demo_df3.income, bins=20, edgecolor="black")(array([11., 24., 37., 12., 18., 18., 8., 9., 9., 9., 6., 2., 2.,
2., 4., 2., 0., 1., 2., 5.]),
array([ 11. , 17.85, 24.7 , 31.55, 38.4 , 45.25, 52.1 , 58.95,
65.8 , 72.65, 79.5 , 86.35, 93.2 , 100.05, 106.9 , 113.75,
120.6 , 127.45, 134.3 , 141.15, 148. ]),
<BarContainer object of 20 artists>)

plt.scatter(demo_df2.age, demo_df2.income)
plt.show()
plt.scatter(demo_df3.age, demo_df3.income)
plt.show()
# Remove extreme incomes. Build the mask from demo_df2's own column (the
# original used demo_df, which only aligns while both frames share an index).
demo_df2 = demo_df2[demo_df2.income < 600]
demo_df2.isna().sum()
gender 0
marital_status 0
address 0
income 0
income_category 0
job_category 0
dtype: int64
demo_df2.head()| age | gender | marital_status | address | income | income_category | job_category | |
|---|---|---|---|---|---|---|---|
| 0 | 55 | f | 1 | 12 | 72.0 | 3.0 | 3 |
| 1 | 56 | m | 0 | 29 | 153.0 | 4.0 | 3 |
| 2 | 28 | f | no answer | 9 | 28.0 | 2.0 | 1 |
| 3 | 24 | m | 1 | 4 | 26.0 | 2.0 | 1 |
| 4 | 25 | m | no answer | 2 | 23.0 | 1.0 | 2 |
demo_df4 = demo_df2[demo_df2.marital_status != 'no answer'].copy()demo_df4.to_csv('../assets/demo_cleaned.csv', index=False)demo_df4.sample(n=5)| age | gender | marital_status | address | income | income_category | job_category | |
|---|---|---|---|---|---|---|---|
| 119 | 53 | f | 0 | 34 | 136.0 | 4.0 | 3 |
| 6 | 44 | m | 1 | 17 | 144.0 | 4.0 | 3 |
| 80 | 38 | m | 0 | 7 | 42.0 | 2.0 | 1 |
| 76 | 19 | f | 1 | 0 | 13.0 | 1.0 | 1 |
| 59 | 28 | m | 0 | 9 | 28.0 | 2.0 | 2 |
demo_df4.info()<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, 0 to 199
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 193 non-null int64
1 gender 193 non-null object
2 marital_status 193 non-null object
3 address 193 non-null int64
4 income 193 non-null float64
5 income_category 193 non-null float64
6 job_category 193 non-null int64
dtypes: float64(2), int64(3), object(2)
memory usage: 12.1+ KB
demo_df4['marital_status'] = demo_df4.marital_status.astype('int')demo_df5 = demo_df4.copy()demo_df5 = pd.get_dummies(data=demo_df5,
columns=['gender'],
drop_first=True,
dtype='int'
)demo_df5.sample(n=5)| age | marital_status | address | income | income_category | job_category | gender_m | |
|---|---|---|---|---|---|---|---|
| 51 | 48 | 0 | 22 | 109.0 | 4.0 | 2 | 1 |
| 183 | 38 | 1 | 18 | 77.0 | 4.0 | 3 | 0 |
| 85 | 30 | 0 | 4 | 23.0 | 1.0 | 1 | 0 |
| 17 | 21 | 0 | 1 | 37.0 | 2.0 | 1 | 1 |
| 156 | 43 | 1 | 5 | 144.0 | 4.0 | 3 | 1 |
logistic_regression_model = LogisticRegression()X = demo_df5.drop('marital_status', axis=1)
y = demo_df5.marital_statusX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)logistic_regression_model.fit(X, y)LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
logistic_regression_model.score(X, y) * 10054.40414507772021
logistic_regression_model.fit(X_train, y_train)LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
logistic_regression_model.score(X_test, y_test)0.42857142857142855
demo_df2[demo_df2.marital_status == 'no answer']| age | gender | marital_status | address | income | income_category | job_category | |
|---|---|---|---|---|---|---|---|
| 2 | 28 | f | no answer | 9 | 28.0 | 2.0 | 1 |
| 4 | 25 | m | no answer | 2 | 23.0 | 1.0 | 2 |
| 7 | 46 | m | no answer | 20 | 75.0 | 4.0 | 3 |
| 8 | 41 | m | no answer | 10 | 26.0 | 2.0 | 2 |
| 9 | 29 | f | no answer | 4 | 19.0 | 1.0 | 2 |
logistic_regression_model.predict([[28, 9, 28, 2, 1, 0]])/home/jumashafara/venvs/programming_for_data_science/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
warnings.warn(
array([0])
predictions = logistic_regression_model.predict(X_test)# X_test['predicted_marial_status'] = predictionsdecision_tree_classifier = DecisionTreeClassifier()decision_tree_classifier.fit(X_train, y_train)DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
decision_tree_classifier.score(X_test, y_test)0.4897959183673469
decision_tree_classifier.predict(X=[[28, 9, 28, 2, 1, 0]])/home/jumashafara/venvs/programming_for_data_science/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names
warnings.warn(
array([0])
decision_tree_classifier.predict(X=X_test)array([1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
0, 0, 0, 1, 1])
# take in X_test, y_test
# predictions on X_test
# true values ie y_test
# match which are correct
# correct/total